Background: Hi! This is my journey of navigating RStudio and becoming an overall more knowledgeable data analyst / scientist. Here I will create a variety of storytelling visuals with RStudio’s built-in datasets.

Iris

Loading Data / Basic Plotting Tutorial

library(ggplot2) #qplot (basic scatterplot) 
library(datasets)
library(tidyverse)
library(ggthemes)
library(tinytex)
library(prettydoc) 
windowsFonts(Times = windowsFont("Times New Roman"))
library(ggthemes)

?datasets library(help = “datasets”) ?iris

iris head(iris)

#basic scatterplot

qplot(Petal.Width, Petal.Length, data=iris)

#basic scatterplot w/ species colored

qplot(Petal.Width, Petal.Length, color=Species, data=iris)

#same thing as above, but strictly ggplot2

ggplot(iris, aes(Petal.Width, Petal.Length, color=Species))+
      geom_point()

#size matches sepal length. jitter separates dots slightly for viewing. alpha makes dots more transparent

ggplot(iris, aes(Petal.Width, Petal.Length, color=Species, size=Sepal.Length))+
  geom_jitter(alpha=0.5)

#size matches sepal length. jitter separates dots slightly for viewing. alpha makes dots more transparent. geom_smooth makes regression line with std. error

ggplot(iris, aes(Petal.Width, Petal.Length, color=Species, size=Sepal.Length))+
  geom_point(size=2)+
  geom_smooth(method=lm)+
  theme(legend.position = "right")

#histogram

ggplot(iris, aes(Petal.Length, fill=Species))+
  geom_histogram()+
  theme(legend.position = "right")

#histogram w/ indiv. plots for species w/ facet_grid

ggplot(iris, aes(Petal.Length, fill=Species))+
  geom_histogram()+
  facet_grid(Species ~ .)

  theme(legend.position = "none")
## List of 1
##  $ legend.position: chr "none"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

#histogram w/ indiv. plots for species w/ facet_grid (geom_density is cleaner, less blocky)

ggplot(iris, aes(Petal.Length, fill=Species))+
  geom_density(alpha=0.5)+
  facet_grid(Species ~ .)

theme(legend.position = "none")
## List of 1
##  $ legend.position: chr "none"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

#facet_grid with previous scatterplots

ggplot(iris, aes(Petal.Width, Petal.Length, color=Species, size=Sepal.Length))+
  geom_point(size=2)+
  geom_smooth(method=lm)+
  facet_grid(Species ~ .)

  theme(legend.position = "right")
## List of 1
##  $ legend.position: chr "right"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

#Key Figures

windowsFonts(Times = windowsFont(“Times New Roman”)) library(ggthemes)

ggplot(iris, aes(Sepal.Width, Sepal.Length, color=Species, size=Sepal.Length))+
  geom_point(size=2)+
  geom_smooth(method=lm)+
  theme_fivethirtyeight(base_size = 12, base_family = "Times")+
  labs(title="Correlation Between Sepal Width and Sepal Length of Iris Species",
       caption="Data were collected by Anderson, Edgar (1935).")+
  labs(x = "Sepal Width")+
  labs(y = "Sepal Length")+
  theme(legend.position = "right")

#SEPAL WIDTH VS. SEPAL LENGTH w/out theme

ggplot(iris, aes(Sepal.Width, Sepal.Length, color=Species, size=Sepal.Length))+
  geom_point(size=2)+
  geom_smooth(method=lm)+
  labs(title="Correlation Between Sepal Width and Sepal Length of Iris Species",
       caption="Data were collected by Anderson, Edgar (1935).")+
  labs(x = "Sepal Width (cm)")+
  labs(y = "Sepal Length (cm)")+
  xlim(2.0, 4.5)+
  ylim(0.0, 10.0)+
  theme(legend.position = "bottom")

#SEPAL WIDTH VS. SEPAL LENGTH w/ theme

ggplot(iris, aes(Sepal.Width, Sepal.Length, color=Species, size=Sepal.Length))+
  geom_point(size=2)+ 
  geom_smooth(method=lm)+
  theme_fivethirtyeight(base_size = 12, base_family = "Times")+
  labs(title="Correlation Between Sepal Width and Sepal Length of Iris Species",
       caption="Data were collected by Anderson, Edgar (1935).")+
  theme(axis.title = element_text()) + ylab('Sepal Length (cm)') + xlab('Sepal Width (cm)')+
  xlim(2.0, 4.5)+
  ylim(0.0, 10.0)+
  theme(legend.position = "bottom")

#PETAL WIDTH VS. PETAL LENGTH w/out theme

ggplot(iris, aes(Petal.Width, Petal.Length, color=Species, size=Petal.Length))+
  geom_point(size=2)+
  geom_smooth(method=lm)+
  labs(title="Correlation Between Petal Width and Petal Length of Iris Species",
       caption="Data were collected by Anderson, Edgar (1935).")+
  labs(x = "Petal Width (cm)")+
  labs(y = "Petal Length (cm)")+
  xlim(0.0, 3.0)+
  ylim(0.0, 8.0)+
  theme(legend.position = "bottom")

#PETAL WIDTH VS. PETAL LENGTH w/ theme & facet

ggplot(iris, aes(Petal.Width, Petal.Length, color=Species, size=Petal.Length))+
  geom_point(size=2)+
  geom_smooth(method=lm)+
  labs(title="Correlation Between Petal Width and Petal Length of Iris Species",
       caption="Data were collected by Anderson, Edgar (1935).")+
  labs(x = "Petal Width (cm)")+
  labs(y = "Petal Length (cm)")+
  xlim(0.0, 3.0)+
  ylim(0.0, 8.0)+
  facet_grid(Species ~ .)+
  theme(legend.position = "right")

#PETAL WIDTH VS. PETAL LENGTH w/ theme

ggplot(iris, aes(Petal.Width, Petal.Length, color=Species, size=Petal.Length))+
  geom_point(size=2)+ 
  geom_smooth(method=lm)+
  theme_fivethirtyeight(base_size = 12, base_family = "Times")+
  labs(title="Correlation Between Petal Width and Petal Length of Iris Species",
       caption="Data were collected by Anderson, Edgar (1935).")+
  theme(axis.title = element_text()) + ylab('Petal Length (cm)') + xlab('Petal Width (cm)')+
  xlim(0.0, 3.0)+
  ylim(0.0, 8.0)+
  theme(legend.position = "bottom")

#PETAL WIDTH VS. PETAL LENGTH w/ theme & facet

ggplot(iris, aes(Petal.Width, Petal.Length, color=Species, size=Petal.Length))+
  geom_point(size=2)+ 
  geom_smooth(method=lm)+
  theme_fivethirtyeight(base_size = 12, base_family = "Times")+
  labs(title="Correlation Between Petal Width and Petal Length of Iris Species",
       caption="Data were collected by Anderson, Edgar (1935).")+
  theme(axis.title = element_text()) + ylab('Petal Length (cm)') + xlab('Petal Width (cm)')+
  xlim(0.0, 3.0)+
  ylim(0.0, 8.0)+
  facet_grid(Species ~ .)+
  theme(legend.position = "right")

#Boxplot NO ggplot2 Sepal Length

boxplot(Sepal.Length~Species,
        data=iris, 
        main='Sepal Length by Species',
        sub='Data were collected by Anderson, Edgar (1935).',
        xlab='Species',
        ylab='Sepal Length (cm)',
        col='darkgreen',
        border='black')

#Boxplot w/ ggplot2 Sepal Length

ggplot(iris, aes(x=Species, y=Sepal.Length, fill=Species)) + 
  geom_boxplot(width=0.5,lwd=1)+
  labs(title="Sepal Length by Species", x = "Species", y = "Sepal Length (cm)",  
       caption="Data were collected by Anderson, Edgar (1935).")

#Boxplot w/ ggplot2 Sepal Width

ggplot(iris, aes(x=Species, y=Sepal.Width, fill=Species)) + 
  geom_boxplot(width=0.5,lwd=1)+
  labs(title="Sepal Length by Species", x = "Species", y = "Sepal Width (cm)",  
       caption="Data were collected by Anderson, Edgar (1935).")

#Boxplot w/ ggplot2 Petal Length

ggplot(iris, aes(x=Species, y=Petal.Length, fill=Species)) + 
  geom_boxplot(width=0.5,lwd=1)+
  labs(title="Sepal Length by Species", x = "Species", y = "Petal Length (cm)",  
       caption="Data were collected by Anderson, Edgar (1935).")

#Boxplot w/ ggplot2 Petal Width

ggplot(iris, aes(x=Species, y=Petal.Width, fill=Species)) + 
  geom_boxplot(width=0.5,lwd=1)+
  labs(title="Sepal Length by Species", x = "Species", y = "Petal Width (cm)",  
       caption="Data were collected by Anderson, Edgar (1935).")

AirPassengers

Loading Data / Basic Plotting Tutorial

library(ggplot2) #qplot (basic scatterplot) 
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(forecast) #for autoplot with geom_smooth linear regression
library(tseries) #for autoplot with geom_smooth linear regression

?datasets library(help = “datasets”) ?AirPassengers

AirPassengers head(AirPassengers)

#this dataset does not have column names, but that won’t stop us

class(AirPassengers) #ts is the output = this means the data is a time series start(AirPassengers) #shows first year end(AirPassengers) #shows last year

plot(AirPassengers, xlab="Year", ylab="Air Passengers (1,000s)", main="Time Series of Air Passengers", 
     sub="The classic Box & Jenkins airline data. Monthly totals of international airline passengers, 1949 to 1960.")

boxplot(AirPassengers~cycle(AirPassengers),xlab="Date", ylab = "Air Passengers (1,000s)" ,main ="Monthly Air Passengers Boxplot from 1949 to 1961")

autoplot(AirPassengers) + 
  geom_smooth(method="lm")+ 
  labs(x ="Date", y = "Air Passengers (1,000s)", title="Air Passengers from 1949 to 1961") 

ARIMA is an acronym for “autoregressive integrated moving average.” It’s a model used in statistics and econometrics to measure events that happen over a period of time.

arimaAirPassengers <- auto.arima(AirPassengers)
arimaAirPassengers
## Series: AirPassengers 
## ARIMA(2,1,1)(0,1,0)[12] 
## 
## Coefficients:
##          ar1     ar2      ma1
##       0.5960  0.2143  -0.9819
## s.e.  0.0888  0.0880   0.0292
## 
## sigma^2 = 132.3:  log likelihood = -504.92
## AIC=1017.85   AICc=1018.17   BIC=1029.35

If the residuals plot is around 0 w/ no movement, then ARIMA model is a good fit

ggtsdiag(arimaAirPassengers)

95% confidence looking 48 months into the future

forecastAirPassengers <- forecast(arimaAirPassengers, level = c(95), h = 48)
autoplot(forecastAirPassengers)

BJsales

Loading Data / Basic Plotting Tutorial

library(ggplot2) #qplot (basic scatterplot) 
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(tseries) #for autoplot with geom_smooth linear regression
library(forecast) #for autoplot with geom_smooth linear regression

?datasets library(help = “datasets”) ?BJsales

BJsales head(BJsales)

class(BJsales) #ts is the output = this means the data is a time series start(BJsales) #shows first year end(BJsales) #shows last year

plot(BJsales, xlab="Time", ylab="Sales", main="Sales ", col.main="SteelBlue",
     sub="The data are given in Box & Jenkins (1976). Obtained from the Time Series Data Library at https://robjhyndman.com/TSDL/")

boxplot(BJsales~cycle(BJsales),xlab=“Date (unknown scale)”, ylab = “Sales (unknown scale)” ,main =“Sales Over Time”)

autoplot(BJsales) + 
  geom_smooth(method="lm")+ 
  labs(x ="Date (unknown scale)", y = "Sales (unknown scale)", title="Sales Over Time") 

#ARIMA is an acronym for “autoregressive integrated moving average.” It’s a model used in statistics and econometrics to measure events that happen over a period of time.

arimaBJsales <- auto.arima(BJsales)
arimaBJsales
## Series: BJsales 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.8800  -0.6415
## s.e.  0.0644   0.1035
## 
## sigma^2 = 1.8:  log likelihood = -254.37
## AIC=514.74   AICc=514.9   BIC=523.75

#if the residuals plot is around - w/ no movement, then ARIMA model is a good fit

ggtsdiag(arimaBJsales)

#95% confidence looking 10 months into the future

forecastBJsales <- forecast(arimaBJsales, level = c(95), h = 10)
autoplot(forecastBJsales)

forecastedJsales <- forecast(arimaBJsales, 10)
plot(forecastedJsales, main="Forecast of Sales Over Time", col.main="SteelBlue", ylab = "Sales (unknown scale)", xlab="Date (unknown scale)")

BOD

Loading Data / Basic Plotting Tutorial

library(ggplot2) #qplot (basic scatterplot) 
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)

?datasets library(help = “datasets”) ?BOD

BOD head(BOD)

class(BOD) #data frame

#same thing as basic ggplot

qplot(Time, demand, data=BOD)

ggplot(BOD, aes(Time, demand))+
  geom_point()

ggplot(BOD, aes(Time, demand))+
  geom_line()+
  expand_limits(y=0, x=0)+
  labs(title="Biochemical Oxygen Demand Versus Time", x = "Time (days)", y = "Biochemical Oxygen Demand (mg/l)",  
       caption="Bates, D.M. and Watts, D.G. (1988), Nonlinear Regression Analysis and Its Applications, Wiley, Appendix A1.4.")+
  theme(plot.title = element_text(color = "SteelBlue"))

CO2

Loading Data / Basic Plotting Tutorial

library(ggplot2) #qplot (basic scatterplot) 
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)

?datasets library(help = “datasets”) ?CO2

CO2 head(CO2)

class(CO2) #data frame

same thing as basic ggplot

qplot(uptake, conc, data=CO2, fill=Type)

ggplot(CO2, aes(x=Treatment, y=uptake)) + 
  geom_boxplot()

t.test(CO2$uptake ~ CO2$Treatment)
## 
##  Welch Two Sample t-test
## 
## data:  CO2$uptake by CO2$Treatment
## t = 3.0485, df = 80.945, p-value = 0.003107
## alternative hypothesis: true difference in means between group nonchilled and group chilled is not equal to 0
## 95 percent confidence interval:
##   2.382366 11.336682
## sample estimates:
## mean in group nonchilled    mean in group chilled 
##                 30.64286                 23.78333
ggplot(CO2, aes(x=Type, y=uptake)) + 
  geom_boxplot()

t.test(CO2$uptake ~ CO2$Type)
## 
##  Welch Two Sample t-test
## 
## data:  CO2$uptake by CO2$Type
## t = 6.5969, df = 78.533, p-value = 4.451e-09
## alternative hypothesis: true difference in means between group Quebec and group Mississippi is not equal to 0
## 95 percent confidence interval:
##   8.839475 16.479572
## sample estimates:
##      mean in group Quebec mean in group Mississippi 
##                  33.54286                  20.88333

set the plotting area into a 1*2 array

par(mfrow = c(1,2)) 
boxplot(CO2$uptake ~ CO2$Treatment)
boxplot(CO2$uptake ~ CO2$Type)

set the plotting area into a 1*1 array

par(mfrow = c(1,1))
boxplot(CO2$uptake ~ CO2$Type + CO2$Treatment)

par(mfrow = c(1,1))
boxplot(subset(CO2$uptake, CO2$conc == 95), 
        subset(CO2$uptake, CO2$conc == 175),
        subset(CO2$uptake, CO2$conc == 250),
        subset(CO2$uptake, CO2$conc == 350),
        subset(CO2$uptake, CO2$conc == 500),
        subset(CO2$uptake, CO2$conc == 675),
        subset(CO2$uptake, CO2$conc == 1000), 
        names = c("95", "175", "250", "350", "500", "675", "1000"))

par(mfrow = c(2,2))
plot(uptake ~ conc, data = subset(CO2,Type == "Quebec" & Treatment == "chilled"), 
     main="Chilled Quebec", xlab = "Concentration (ML/L)", ylab = "Carbon Dioxide Uptake (μmol/m2s)" )
plot(uptake ~ conc, data = subset(CO2,Type == "Mississippi" & Treatment == "chilled"), 
     main="Chilled Mississippi", xlab = "Concentration (ML/L)", ylab = "Carbon Dioxide Uptake (μmol/m2s)")
plot(uptake ~ conc, data = subset(CO2,Type == "Quebec" & Treatment == "nonchilled"),
     main="Nonchilled Quebec", xlab = "Concentration (ML/L)", ylab = "Carbon Dioxide Uptake (μmol/m2s)")
plot(uptake ~ conc, data = subset(CO2,Type == "Mississippi" & Treatment == "nonchilled"), 
     main="Nonchilled Mississippi", xlab = "Concentration (ML/L)", ylab = "Carbon Dioxide Uptake (μmol/m2s)")

ggplot(CO2, aes(conc, uptake, color=Treatment, shape=Type)) + 
  geom_point()+
  expand_limits(y=0, x=0)+
  labs(title="CO2 Uptake of Plants from Mississippi and Quebec", x = "Concentration (ML/L)", y = "Carbon Dioxide Uptake (μmol/m2s)",  
       caption="The CO2 data frame has 84 rows and 5 columns of data from an experiment on the cold tolerance of the grass species Echinochloa crus-galli.")

ggplot(CO2, aes(conc, uptake, color=Plant, type=Treatment)) + 
  geom_line()+
  geom_point()+
  expand_limits(y=0, x=0)+
  labs(title="CO2 Uptake of Plants from Mississippi and Quebec", x = "Concentration (ML/L)", y = "Carbon Dioxide Uptake (μmol/m2s)",  
       caption="The CO2 data frame has 84 rows and 5 columns of data from an experiment on the cold tolerance of the grass species Echinochloa crus-galli.")

ChickWeight

Loading Data / Basic Plotting Tutorial

library(ggplot2) #qplot (basic scatterplot) 
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)

?datasets library(help = “datasets”) ?ChickWeight

ChickWeight head(ChickWeight)

class(ChickWeight) #data frame

The body weights of the chicks were measured at birth and every second day thereafter until day 20. They were also measured on day 21. There were four groups on chicks on different protein diets.

tail(ChickWeight)

chick0 <- ChickWeight[ChickWeight$Time %in% 0:5, ]
chick1 <- ChickWeight[ChickWeight$Time %in% 6:10, ]
chick2 <- ChickWeight[ChickWeight$Time %in% 11:15, ]
chick3 <- ChickWeight[ChickWeight$Time %in% 16:21, ]

chick0 chick1 chick2 chick3

par(mfrow = c(2,2)) 
boxplot(chick0$weight ~ chick0$Diet, 
        main = "Chick Weight By Diet Type (0-5 Days)",
        xlab = "Diet Types",
        ylab = "Weight",
        ylim = c(0, 300))
boxplot(chick1$weight ~ chick1$Diet, 
        main = "Chick Weight By Diet Type (6-10 Days)",
        xlab = "Diet Types",
        ylab = "Weight",
        ylim = c(0, 300))
boxplot(chick2$weight ~ chick2$Diet, 
        main = "Chick Weight By Diet Type (11-15 Days)",
        xlab = "Diet Types",
        ylab = "Weight",
        ylim = c(0, 300))
boxplot(chick3$weight ~ chick3$Diet, 
        main = "Chick Weight By Diet Type (16-21 Days)",
        xlab = "Diet Types",
        ylab = "Weight",
        ylim = c(0, 300))

chickDiet1 <- ChickWeight[ChickWeight$Diet == 1, ]
chickDiet2 <- ChickWeight[ChickWeight$Diet == 2, ]
chickDiet3 <- ChickWeight[ChickWeight$Diet == 3, ]
chickDiet4 <- ChickWeight[ChickWeight$Diet == 4, ]

chickDiet1 chickDiet2 chickDiet3 chickDiet4

par(mfrow = c(2,2)) 
boxplot(chickDiet1$weight ~ chickDiet1$Time, 
        main = "Chick Weight By Diet Type (1)",
        xlab = "Days",
        ylab = "Weight (gm)",
        ylim = c(0, 300))
boxplot(chickDiet2$weight ~ chickDiet2$Time, 
        main = "Chick Weight By Diet Type (2)",
        xlab = "Days",
        ylab = "Weight (gm)",
        ylim = c(0, 300))
boxplot(chickDiet3$weight ~ chickDiet3$Time, 
        main = "Chick Weight By Diet Type (3)",
        xlab = "Days",
        ylab = "Weight (gm)",
        ylim = c(0, 300))
boxplot(chickDiet4$weight ~ chickDiet4$Time, 
        main = "Chick Weight By Diet Type (4)",
        xlab = "Days",
        ylab = "Weight (gm)",
        ylim = c(0, 300))

same thing as basic ggplot

qplot(Time, weight, data=ChickWeight, color=factor(Diet), facets = Diet ~ .,
      geom=c("point", "smooth"), 
      main="Protein Diet Versus Chick Weight ", xlab = "Time (days)", ylab = "Chick Weight (gm)" )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(ChickWeight, aes(x=Diet, y=weight)) + 
  geom_boxplot()

DNase

Loading Data / Basic Plotting Tutorial

library(ggplot2) #qplot (basic scatterplot) 
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)

?datasets library(help = “datasets”) ?DNase

The DNase data frame has 176 rows and 3 columns of data obtained during development of an ELISA assay for the recombinant protein DNase in rat serum.

DNase head(DNase)

class(DNase) #data frame

Looking at distribution of data

par(mfrow = c(1,1))
hist(DNase$density, breaks=10, main = "")

boxplot(density ~ Run, data = DNase)

Plotting outliers

par(mfrow = c(1,1))
boxplot(DNase$density~DNase$conc,xlab="Concentration of Protein (ng/ml)",ylab="Optical Density")

Finding outliers

boxplot.stats(DNase$density[DNase$conc==12.5])$out
## [1] 1.932 1.914 2.003 1.884
boxplot.stats(DNase$density[DNase$conc==6.25])$out
## [1] 1.629

Removing outliers

df_no_outliers <- subset(x=DNase, !density %in% boxplot.stats(DNase$density[DNase$conc==12.5])$out & !density %in% boxplot.stats(DNase$density[DNase$conc==6.25])$out ) 

Final Graph

ggplot(df_no_outliers, aes(x=conc, y=density)) + 
  geom_smooth()+
  geom_point()+
  expand_limits(y=0, x=0)+
  labs(title="Recombinant Protein DNase ELISA Assay Density Versus Concentration", x = "Concentration of Protein (ng/ml)", y = "Optical Density",  
       caption="The DNase data frame has 176 rows and 3 columns of data obtained during development of an ELISA assay for the recombinant protein DNase in rat serum.")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

EUStockMarkets

Loading Data / Basic Plotting Tutorial

library(ggplot2) #qplot (basic scatterplot) 
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)
library(broom)
library(dplyr)

?datasets library(help = “datasets”) ?EuStockMarkets

Contains the daily closing prices of major European stock indices: Germany DAX (Ibis), Switzerland SMI, France CAC, and UK FTSE. The data are sampled in business time, i.e., weekends and holidays are omitted.

EuStockMarkets head(EuStockMarkets)

class(EuStockMarkets) #timeseries

EuStockDF<-as.data.frame(EuStockMarkets) 
EuStockDF$Date <- as.numeric(time(EuStockMarkets)) 

now the dates have an official column head(EuStockDF)

ggplot(EuStockDF,aes(x = Date))+
  expand_limits(y=c(2000, 9000))+
  labs(title="European Stock Indices 1991 to 1998", x = "Year", y = "Stock Price",  
       caption="Contains the daily closing prices of major European stock indices: Germany DAX (Ibis), Switzerland SMI,               France CAC,and UK FTSE. The data are sampled in business time, i.e., weekends and holidays are omitted.")+
  geom_line(aes(y = DAX), color="darkslateblue")+
  geom_line(aes(y = SMI), color="darkorchid")+
  geom_line(aes(y = CAC), color="mediumseagreen")+
  geom_line(aes(y = FTSE), color="indianred3")

tidyEuStockMarkets <- tidy(EuStockMarkets)
tidyEuStockMarkets <- tidyEuStockMarkets %>%
  rename(Date=index,Stock_Index=series,Price=value)
tidyEuStockMarkets
## # A tibble: 7,440 x 3
##     Date Stock_Index Price
##    <dbl> <chr>       <dbl>
##  1 1991. DAX         1629.
##  2 1991. SMI         1678.
##  3 1991. CAC         1773.
##  4 1991. FTSE        2444.
##  5 1992. DAX         1614.
##  6 1992. SMI         1688.
##  7 1992. CAC         1750.
##  8 1992. FTSE        2460.
##  9 1992. DAX         1607.
## 10 1992. SMI         1679.
## # ... with 7,430 more rows
## # i Use `print(n = ...)` to see more rows
ggplot(tidyEuStockMarkets,aes(x=Date,y=Price))+
  geom_line(aes(color=Stock_Index))+
  expand_limits(y=c(2000, 10000))+
  labs(title="European Stock Indices 1991 to 1998", x = "Year", y = "Stock Price",  
       caption="Contains the daily closing prices of major European stock indices: Germany DAX (Ibis), Switzerland SMI,           France CAC,and UK FTSE. The data are sampled in business time, i.e., weekends and holidays are omitted.")

HairEyeColor

Loading Data / Basic Plotting Tutorial

library(ggplot2) #qplot (basic scatterplot) 
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)

?datasets library(help = “datasets”) ?HairEyeColor

#Distribution of hair and eye color and sex in 592 statistics students.

HairEyeColor head(HairEyeColor)

class(HairEyeColor) #table

ggplot(HairEyeColor, aes(Eye, Freq, fill=Sex))+
    geom_boxplot()

ggplot(HairEyeColor, aes(Hair, Freq, fill=Sex))+
  geom_boxplot()

ggplot(HairEyeColor, aes(Hair, fill=Hair))+
  scale_fill_manual(values=c("#1C1504", "#5E4303", "#F07743", "#FFFBF1"))+
  geom_density(alpha=0.7)

ggplot(HairEyeColor, aes(Eye, fill=Eye,))+
  scale_fill_manual(values=c("#3B1002", "cyan3", "#93820B", "chartreuse4"))+
  geom_density(alpha=0.7)

library(vcd)
## Loading required package: grid
mosaic(HairEyeColor, shade = TRUE)

HairEyeColorchi <- HairEyeColor[,,"Male"] + HairEyeColor[,,"Female"]
HairEyeColorchi
##        Eye
## Hair    Brown Blue Hazel Green
##   Black    68   20    15     5
##   Brown   119   84    54    29
##   Red      26   17    14    14
##   Blond     7   94    10    16
#p-value < 2.2e-16 significant
chisq.test(HairEyeColorchi)
## 
##  Pearson's Chi-squared test
## 
## data:  HairEyeColorchi
## X-squared = 138.29, df = 9, p-value < 2.2e-16

HairEyeColorchi/sum

barplot(HairEyeColorchi, beside=TRUE, legend.text=TRUE, xlab="Eye Color", ylab="Frequency", main="Eye Color vs. Hair Color", 
        col=c("#1C1504", "#5E4303", "#F07743", "#FFFBF1")) 

HairEyeColor

#2 means columns, 1 means rows
MH <- apply(HairEyeColor[,,"Male"],1,sum)
MH
## Black Brown   Red Blond 
##    56   143    34    46
FH <- apply(HairEyeColor[,,"Female"],1,sum)
FH
## Black Brown   Red Blond 
##    52   143    37    81
HairEyeColorchi2 <- cbind(MH,FH)
HairEyeColorchi2
##        MH  FH
## Black  56  52
## Brown 143 143
## Red    34  37
## Blond  46  81
#p-value = 0.04613 somehwta significant 
chisq.test(HairEyeColorchi2)
## 
##  Pearson's Chi-squared test
## 
## data:  HairEyeColorchi2
## X-squared = 7.9942, df = 3, p-value = 0.04613
barplot(HairEyeColorchi2, beside=TRUE, legend.text=TRUE, xlab="Gender", ylab="Frequency", main="Gender vs. Hair Color", 
        names.arg=c("Male","Female"), col=c("#1C1504", "#5E4303", "#F07743", "#FFFBF1"))

HairEyeColor

#2 means columns, 1 means rows
ME <- apply(HairEyeColor[,,"Male"],2,sum)
FE <- apply(HairEyeColor[,,"Female"],2,sum)
HairEyeColorchi3 <- cbind(ME,FE)

HairEyeColorchi3
##        ME  FE
## Brown  98 122
## Blue  101 114
## Hazel  47  46
## Green  33  31
#p-value = 0.6754 not significant 
chisq.test(HairEyeColorchi3)
## 
##  Pearson's Chi-squared test
## 
## data:  HairEyeColorchi3
## X-squared = 1.5298, df = 3, p-value = 0.6754
barplot(HairEyeColorchi3, beside=TRUE, legend.text=TRUE, xlab="Gender", ylab="Frequency", main="Gender vs. Eye Color",
        names.arg=c("Male","Female"), col=c("#3B1002", "cyan3", "#93820B", "chartreuse4"))

Harman23.cor

Loading Data / Basic Plotting Tutorial

library(ggplot2) #qplot (basic scatterplot) 
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)
library(ggcorrplot)

?datasets library(help = “datasets”) ?Harman23.cor

A correlation matrix of eight physical measurements on 305 girls between ages seven and seventeen.

Harman23.cor
## $cov
##                height arm.span forearm lower.leg weight bitro.diameter
## height          1.000    0.846   0.805     0.859  0.473          0.398
## arm.span        0.846    1.000   0.881     0.826  0.376          0.326
## forearm         0.805    0.881   1.000     0.801  0.380          0.319
## lower.leg       0.859    0.826   0.801     1.000  0.436          0.329
## weight          0.473    0.376   0.380     0.436  1.000          0.762
## bitro.diameter  0.398    0.326   0.319     0.329  0.762          1.000
## chest.girth     0.301    0.277   0.237     0.327  0.730          0.583
## chest.width     0.382    0.415   0.345     0.365  0.629          0.577
##                chest.girth chest.width
## height               0.301       0.382
## arm.span             0.277       0.415
## forearm              0.237       0.345
## lower.leg            0.327       0.365
## weight               0.730       0.629
## bitro.diameter       0.583       0.577
## chest.girth          1.000       0.539
## chest.width          0.539       1.000
## 
## $center
## [1] 0 0 0 0 0 0 0 0
## 
## $n.obs
## [1] 305
head(Harman23.cor)
## $cov
##                height arm.span forearm lower.leg weight bitro.diameter
## height          1.000    0.846   0.805     0.859  0.473          0.398
## arm.span        0.846    1.000   0.881     0.826  0.376          0.326
## forearm         0.805    0.881   1.000     0.801  0.380          0.319
## lower.leg       0.859    0.826   0.801     1.000  0.436          0.329
## weight          0.473    0.376   0.380     0.436  1.000          0.762
## bitro.diameter  0.398    0.326   0.319     0.329  0.762          1.000
## chest.girth     0.301    0.277   0.237     0.327  0.730          0.583
## chest.width     0.382    0.415   0.345     0.365  0.629          0.577
##                chest.girth chest.width
## height               0.301       0.382
## arm.span             0.277       0.415
## forearm              0.237       0.345
## lower.leg            0.327       0.365
## weight               0.730       0.629
## bitro.diameter       0.583       0.577
## chest.girth          1.000       0.539
## chest.width          0.539       1.000
## 
## $center
## [1] 0 0 0 0 0 0 0 0
## 
## $n.obs
## [1] 305
class(Harman23.cor) #list
## [1] "list"

Exporting file to csv so I can manipulate it to work more easily with corrplot getwd() write.csv(Harman23.cor,“C:/Users/Siggi/Downloads/Harman23.cor.csv”, row.names = FALSE)

Harman23DF <- as.data.frame(Harman23.cor,  col.names = c("", "", ""))#dataframe now
Harman23DF #dataframe now so ggcorrplot will work
##                height arm.span forearm lower.leg weight bitro.diameter
## height          1.000    0.846   0.805     0.859  0.473          0.398
## arm.span        0.846    1.000   0.881     0.826  0.376          0.326
## forearm         0.805    0.881   1.000     0.801  0.380          0.319
## lower.leg       0.859    0.826   0.801     1.000  0.436          0.329
## weight          0.473    0.376   0.380     0.436  1.000          0.762
## bitro.diameter  0.398    0.326   0.319     0.329  0.762          1.000
## chest.girth     0.301    0.277   0.237     0.327  0.730          0.583
## chest.width     0.382    0.415   0.345     0.365  0.629          0.577
##                chest.girth chest.width c.0..0..0..0..0..0..0..0. X305
## height               0.301       0.382                         0  305
## arm.span             0.277       0.415                         0  305
## forearm              0.237       0.345                         0  305
## lower.leg            0.327       0.365                         0  305
## weight               0.730       0.629                         0  305
## bitro.diameter       0.583       0.577                         0  305
## chest.girth          1.000       0.539                         0  305
## chest.width          0.539       1.000                         0  305
class(Harman23DF)
## [1] "data.frame"

Anything correlated to itself will be 1 so ignore middle diagonal and anything paired to itself

ggcorrplot(Harman23DF, hc.order = TRUE, outline.col = "white")
## Warning in as.dist.default((1 - cormat)/2): non-square matrix

library(corrplot)
## corrplot 0.92 loaded
#removing unnecessary columns
Harman23DF$c.0..0..0..0..0..0..0..0. <- NULL
Harman23DF$X305 <- NULL
Harman23DF
##                height arm.span forearm lower.leg weight bitro.diameter
## height          1.000    0.846   0.805     0.859  0.473          0.398
## arm.span        0.846    1.000   0.881     0.826  0.376          0.326
## forearm         0.805    0.881   1.000     0.801  0.380          0.319
## lower.leg       0.859    0.826   0.801     1.000  0.436          0.329
## weight          0.473    0.376   0.380     0.436  1.000          0.762
## bitro.diameter  0.398    0.326   0.319     0.329  0.762          1.000
## chest.girth     0.301    0.277   0.237     0.327  0.730          0.583
## chest.width     0.382    0.415   0.345     0.365  0.629          0.577
##                chest.girth chest.width
## height               0.301       0.382
## arm.span             0.277       0.415
## forearm              0.237       0.345
## lower.leg            0.327       0.365
## weight               0.730       0.629
## bitro.diameter       0.583       0.577
## chest.girth          1.000       0.539
## chest.width          0.539       1.000
corrplot(as.matrix(Harman23DF), is.corr = TRUE, method = "square")

corrplot(as.matrix(Harman23DF), is.corr = TRUE, method = "square", diag = FALSE)

testHarman23DF = cor.mtest(Harman23DF, conf.level = 0.95)
testHarman23DF
## $p
##                      height     arm.span      forearm    lower.leg      weight
## height         0.0000000000 4.579308e-04 8.728689e-04 0.0001847906 0.027700238
## arm.span       0.0004579308 0.000000e+00 4.333368e-05 0.0006216409 0.004772576
## forearm        0.0008728689 4.333368e-05 0.000000e+00 0.0009913897 0.008574570
## lower.leg      0.0001847906 6.216409e-04 9.913897e-04 0.0000000000 0.018631694
## weight         0.0277002379 4.772576e-03 8.574570e-03 0.0186316940 0.000000000
## bitro.diameter 0.0317509548 9.406004e-03 1.483709e-02 0.0123784231 0.009993784
## chest.girth    0.0059682247 2.779410e-03 2.601843e-03 0.0121484562 0.016784732
## chest.width    0.0448369232 7.425068e-02 4.805126e-02 0.0398602049 0.189857892
##                bitro.diameter chest.girth chest.width
## height            0.031750955 0.005968225  0.04483692
## arm.span          0.009406004 0.002779410  0.07425068
## forearm           0.014837090 0.002601843  0.04805126
## lower.leg         0.012378423 0.012148456  0.03986020
## weight            0.009993784 0.016784732  0.18985789
## bitro.diameter    0.000000000 0.104803965  0.21853563
## chest.girth       0.104803965 0.000000000  0.26562514
## chest.width       0.218535632 0.265625144  0.00000000
## 
## $lowCI
##                    height   arm.span    forearm  lower.leg     weight
## height          1.0000000  0.7075883  0.6465730  0.7776590 -0.9544454
## arm.span        0.7075883  1.0000000  0.8584361  0.6799248 -0.9765496
## forearm         0.6465730  0.8584361  1.0000000  0.6333185 -0.9709085
## lower.leg       0.7776590  0.6799248  0.6333185  1.0000000 -0.9609777
## weight         -0.9544454 -0.9765496 -0.9709085 -0.9609777  1.0000000
## bitro.diameter -0.9519096 -0.9698878 -0.9642406 -0.9666167  0.3147889
## chest.girth    -0.9745527 -0.9807106 -0.9811615 -0.9668523  0.2221893
## chest.width    -0.9447176 -0.9317223 -0.9431240 -0.9473045 -0.2956608
##                bitro.diameter chest.girth chest.width
## height             -0.9519096  -0.9745527  -0.9447176
## arm.span           -0.9698878  -0.9807106  -0.9317223
## forearm            -0.9642406  -0.9811615  -0.9431240
## lower.leg          -0.9666167  -0.9668523  -0.9473045
## weight              0.3147889   0.2221893  -0.2956608
## bitro.diameter      1.0000000  -0.1586043  -0.3287749
## chest.girth        -0.1586043   1.0000000  -0.3750832
## chest.width        -0.3287749  -0.3750832   1.0000000
## 
## $uppCI
##                     height    arm.span     forearm   lower.leg     weight
## height          1.00000000  0.98977320  0.98719781  0.99252006 -0.1257605
## arm.span        0.98977320  1.00000000  0.99543787  0.98862792 -0.4333915
## forearm         0.98719781  0.99543787  1.00000000  0.98661384 -0.3406765
## lower.leg       0.99252006  0.98862792  0.98661384  1.00000000 -0.2026428
## weight         -0.12576053 -0.43339147 -0.34067649 -0.20264283  1.0000000
## bitro.diameter -0.09837645 -0.32511523 -0.24489920 -0.27749097  0.9691978
## chest.girth    -0.39920171 -0.51003849 -0.51881885 -0.28081167  0.9625137
## chest.width    -0.02715660  0.08157119 -0.01254391 -0.05174479  0.8953537
##                bitro.diameter chest.girth chest.width
## height            -0.09837645  -0.3992017 -0.02715660
## arm.span          -0.32511523  -0.5100385  0.08157119
## forearm           -0.24489920  -0.5188189 -0.01254391
## lower.leg         -0.27749097  -0.2808117 -0.05174479
## weight             0.96919778   0.9625137  0.89535365
## bitro.diameter     1.00000000   0.9206218  0.88783153
## chest.girth        0.92062183   1.0000000  0.87609576
## chest.width        0.88783153   0.8760958  1.00000000
#could not get this to work perfectly
corrplot(as.matrix(Harman23DF), is.corr = TRUE, method = "square", diag = FALSE, p.mat = testHarman23DF$p, sig.level = 0.05)

Main problem was reformatting original dataset to work with the corrplot function

mtcars

Loading Data / Basic Plotting Tutorial

library(ggplot2) #qplot (basic scatterplot) 
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)
library(ggcorrplot)

?datasets library(help = “datasets”) ?mtcars

The data was extracted from the 1974 Motor Trend US magazine, and comprises fuel consumption and 10 aspects of automobile design and performance for 32 automobiles (1973–74 models).

mtcars head(mtcars)

Only would do this if I want to export file to csv so I can manipulate it to work more easily with corrplot / renaming getwd() write.csv(mtcars,“C:/Users/Siggi/Downloads/mtcars.csv”, row.names = FALSE)

renaming columns

mtcarsnew <- rename(mtcars, "# Gears" = gear, "# Carburetors" = carb, Transmission = am, 
                    Engine = vs, "1/4 Mile Time" = qsec, Weight = wt, "Rear Axle Ratio" = drat, 
                    Horsepower = hp, Displacement = disp, "# Cylinders" = cyl, MPG = mpg)

mtcarsnew

class(mtcarsnew) #data frame
## [1] "data.frame"
mtcarsCOR = cor(mtcarsnew)
mtcarsCOR
##                        MPG # Cylinders Displacement Horsepower Rear Axle Ratio
## MPG              1.0000000  -0.8521620   -0.8475514 -0.7761684      0.68117191
## # Cylinders     -0.8521620   1.0000000    0.9020329  0.8324475     -0.69993811
## Displacement    -0.8475514   0.9020329    1.0000000  0.7909486     -0.71021393
## Horsepower      -0.7761684   0.8324475    0.7909486  1.0000000     -0.44875912
## Rear Axle Ratio  0.6811719  -0.6999381   -0.7102139 -0.4487591      1.00000000
## Weight          -0.8676594   0.7824958    0.8879799  0.6587479     -0.71244065
## 1/4 Mile Time    0.4186840  -0.5912421   -0.4336979 -0.7082234      0.09120476
## Engine           0.6640389  -0.8108118   -0.7104159 -0.7230967      0.44027846
## Transmission     0.5998324  -0.5226070   -0.5912270 -0.2432043      0.71271113
## # Gears          0.4802848  -0.4926866   -0.5555692 -0.1257043      0.69961013
## # Carburetors   -0.5509251   0.5269883    0.3949769  0.7498125     -0.09078980
##                     Weight 1/4 Mile Time     Engine Transmission    # Gears
## MPG             -0.8676594    0.41868403  0.6640389   0.59983243  0.4802848
## # Cylinders      0.7824958   -0.59124207 -0.8108118  -0.52260705 -0.4926866
## Displacement     0.8879799   -0.43369788 -0.7104159  -0.59122704 -0.5555692
## Horsepower       0.6587479   -0.70822339 -0.7230967  -0.24320426 -0.1257043
## Rear Axle Ratio -0.7124406    0.09120476  0.4402785   0.71271113  0.6996101
## Weight           1.0000000   -0.17471588 -0.5549157  -0.69249526 -0.5832870
## 1/4 Mile Time   -0.1747159    1.00000000  0.7445354  -0.22986086 -0.2126822
## Engine          -0.5549157    0.74453544  1.0000000   0.16834512  0.2060233
## Transmission    -0.6924953   -0.22986086  0.1683451   1.00000000  0.7940588
## # Gears         -0.5832870   -0.21268223  0.2060233   0.79405876  1.0000000
## # Carburetors    0.4276059   -0.65624923 -0.5696071   0.05753435  0.2740728
##                 # Carburetors
## MPG               -0.55092507
## # Cylinders        0.52698829
## Displacement       0.39497686
## Horsepower         0.74981247
## Rear Axle Ratio   -0.09078980
## Weight             0.42760594
## 1/4 Mile Time     -0.65624923
## Engine            -0.56960714
## Transmission       0.05753435
## # Gears            0.27407284
## # Carburetors      1.00000000
ggcorrplot(mtcarsCOR, hc.order = TRUE, outline.col = "white")

corrplot(mtcarsCOR, is.corr = TRUE, method = "square", diag = FALSE, order = 'AOE', addCoef.col = "gray")

testmtcarsCOR = cor.mtest(mtcarsCOR, conf.level = 0.95)
testmtcarsCOR
## $p
##                          MPG  # Cylinders Displacement   Horsepower
## MPG             0.000000e+00 4.037623e-09 1.091445e-09 4.322152e-06
## # Cylinders     4.037623e-09 0.000000e+00 1.659424e-09 5.625666e-07
## Displacement    1.091445e-09 1.659424e-09 0.000000e+00 1.304240e-05
## Horsepower      4.322152e-06 5.625666e-07 1.304240e-05 0.000000e+00
## Rear Axle Ratio 1.780708e-05 5.174268e-05 4.935086e-06 2.049276e-03
## Weight          2.518729e-08 1.320552e-06 1.901561e-08 1.787328e-04
## 1/4 Mile Time   1.471451e-02 5.949087e-03 1.829871e-02 3.509603e-04
## Engine          3.020659e-05 2.026977e-06 3.450779e-05 3.690380e-08
## Transmission    1.704116e-03 4.379896e-03 1.122485e-03 3.593445e-02
## # Gears         5.780693e-03 9.922460e-03 3.028291e-03 7.040180e-02
## # Carburetors   3.193816e-03 2.006581e-03 6.969729e-03 4.718601e-05
##                 Rear Axle Ratio       Weight 1/4 Mile Time       Engine
## MPG                1.780708e-05 2.518729e-08  1.471451e-02 3.020659e-05
## # Cylinders        5.174268e-05 1.320552e-06  5.949087e-03 2.026977e-06
## Displacement       4.935086e-06 1.901561e-08  1.829871e-02 3.450779e-05
## Horsepower         2.049276e-03 1.787328e-04  3.509603e-04 3.690380e-08
## Rear Axle Ratio    0.000000e+00 3.782478e-07  1.426220e-01 3.468419e-03
## Weight             3.782478e-07 0.000000e+00  5.627606e-02 4.608706e-04
## 1/4 Mile Time      1.426220e-01 5.627606e-02  0.000000e+00 1.673365e-04
## Engine             3.468419e-03 4.608706e-04  1.673365e-04 0.000000e+00
## Transmission       1.229604e-05 1.431601e-04  5.473798e-01 5.385274e-02
## # Gears            6.018579e-05 7.977054e-04  6.815004e-01 8.393306e-02
## # Carburetors      6.998155e-02 1.713296e-02  8.438162e-06 9.355873e-05
##                 Transmission      # Gears # Carburetors
## MPG             1.704116e-03 5.780693e-03  3.193816e-03
## # Cylinders     4.379896e-03 9.922460e-03  2.006581e-03
## Displacement    1.122485e-03 3.028291e-03  6.969729e-03
## Horsepower      3.593445e-02 7.040180e-02  4.718601e-05
## Rear Axle Ratio 1.229604e-05 6.018579e-05  6.998155e-02
## Weight          1.431601e-04 7.977054e-04  1.713296e-02
## 1/4 Mile Time   5.473798e-01 6.815004e-01  8.438162e-06
## Engine          5.385274e-02 8.393306e-02  9.355873e-05
## Transmission    0.000000e+00 1.702701e-07  3.055664e-01
## # Gears         1.702701e-07 0.000000e+00  4.848884e-01
## # Carburetors   3.055664e-01 4.848884e-01  0.000000e+00
## 
## $lowCI
##                        MPG # Cylinders Displacement Horsepower Rear Axle Ratio
## MPG              1.0000000  -0.9976823   -0.9982698 -0.9888091       0.7778575
## # Cylinders     -0.9976823   1.0000000    0.9700514  0.8931885      -0.9801076
## Displacement    -0.9982698   0.9700514    1.0000000  0.7918066      -0.9884636
## Horsepower      -0.9888091   0.8931885    0.7918066  1.0000000      -0.9514038
## Rear Axle Ratio  0.7778575  -0.9801076   -0.9884636 -0.9514038       1.0000000
## Weight          -0.9965075   0.8717271    0.9488324  0.6445032      -0.9935715
## 1/4 Mile Time    0.1885083  -0.9359355   -0.9129200 -0.9686117      -0.1782837
## Engine           0.7521771  -0.9905881   -0.9819081 -0.9961948       0.3723792
## Transmission     0.4497653  -0.9408983   -0.9582352 -0.8941274       0.7943524
## # Gears          0.3115428  -0.9265023   -0.9462972 -0.8698401       0.7145584
## # Carburetors   -0.9455549   0.4327021    0.2881435  0.7283968      -0.8700876
##                     Weight 1/4 Mile Time       Engine Transmission     # Gears
## MPG             -0.9965075     0.1885083  0.752177112  0.449765308  0.31154285
## # Cylinders      0.8717271    -0.9359355 -0.990588096 -0.940898308 -0.92650229
## Displacement     0.9488324    -0.9129200 -0.981908097 -0.958235185 -0.94629724
## Horsepower       0.6445032    -0.9686117 -0.996194795 -0.894127367 -0.86984015
## Rear Axle Ratio -0.9935715    -0.1782837  0.372379155  0.794352378  0.71455837
## Weight           1.0000000    -0.8786800 -0.966462744 -0.974695144 -0.96162988
## 1/4 Mile Time   -0.8786800     1.0000000  0.649145621 -0.451068301 -0.50207431
## Engine          -0.9664627     0.6491456  1.000000000 -0.008706491 -0.08354933
## Transmission    -0.9746951    -0.4510683 -0.008706491  1.000000000  0.91749704
## # Gears         -0.9616299    -0.5020743 -0.083549333  0.917497040  1.00000000
## # Carburetors    0.1669910    -0.9869495 -0.977126817 -0.780865674 -0.73218796
##                 # Carburetors
## MPG                -0.9455549
## # Cylinders         0.4327021
## Displacement        0.2881435
## Horsepower          0.7283968
## Rear Axle Ratio    -0.8700876
## Weight              0.1669910
## 1/4 Mile Time      -0.9869495
## Engine             -0.9771268
## Transmission       -0.7808657
## # Gears            -0.7321880
## # Carburetors       1.0000000
## 
## $uppCI
##                        MPG # Cylinders Displacement  Horsepower Rear Axle Ratio
## MPG              1.0000000  -0.9635787   -0.9726924 -0.83492892      0.98449032
## # Cylinders     -0.9635787   1.0000000    0.9981001  0.99296695     -0.72322813
## Displacement    -0.9726924   0.9981001    1.0000000  0.98556954     -0.83023726
## Horsepower      -0.8349289   0.9929670    0.9855695  1.00000000     -0.43047203
## Rear Axle Ratio  0.9844903  -0.7232281   -0.8302373 -0.43047203      1.00000000
## Weight          -0.9455873   0.9914634    0.9967209  0.97331807     -0.90194717
## 1/4 Mile Time    0.9180840  -0.3079903   -0.1575524 -0.59373847      0.83538327
## Engine           0.9824617  -0.8594408   -0.7452884 -0.94085090      0.94438092
## Transmission     0.9536222  -0.3451431   -0.4914487 -0.05617533      0.98576482
## # Gears          0.9364210  -0.2422846   -0.3877983  0.05343045      0.97938823
## # Carburetors   -0.3817880   0.9516630    0.9331780  0.98053328      0.05241567
##                      Weight 1/4 Mile Time     Engine Transmission     # Gears
## MPG             -0.94558734    0.91808400  0.9824617   0.95362221  0.93642103
## # Cylinders      0.99146342   -0.30799033 -0.8594408  -0.34514312 -0.24228456
## Displacement     0.99672088   -0.15755245 -0.7452884  -0.49144868 -0.38779827
## Horsepower       0.97331807   -0.59373847 -0.9408509  -0.05617533  0.05343045
## Rear Axle Ratio -0.90194717    0.83538327  0.9443809   0.98576482  0.97938823
## Weight           1.00000000    0.01595598 -0.5715107  -0.65992608 -0.52355311
## 1/4 Mile Time    0.01595598    1.00000000  0.9737351   0.71623124  0.68252616
## Engine          -0.57151075    0.97373509  1.0000000   0.88032208  0.86227799
## Transmission    -0.65992608    0.71623124  0.8803221   1.00000000  0.99463195
## # Gears         -0.52355311    0.68252616  0.8622780   0.99463195  1.00000000
## # Carburetors    0.91452061   -0.80994159 -0.6878237   0.32597570  0.42393232
##                 # Carburetors
## MPG               -0.38178798
## # Cylinders        0.95166303
## Displacement       0.93317802
## Horsepower         0.98053328
## Rear Axle Ratio    0.05241567
## Weight             0.91452061
## 1/4 Mile Time     -0.80994159
## Engine            -0.68782370
## Transmission       0.32597570
## # Gears            0.42393232
## # Carburetors      1.00000000
corrplot(mtcarsCOR, is.corr = TRUE, method = "square", diag = FALSE, order = 'AOE', addCoef.col = "gray", p.mat = testmtcarsCOR$p, sig.level = 0.05)

Indometh

Loading Data / Basic Plotting Tutorial

library(ggplot2) #qplot (basic scatterplot) 
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)
library(ggcorrplot)

?datasets library(help = “datasets”) ?Indometh

The Indometh data frame has 66 rows and 3 columns of data on the pharmacokinetics of indometacin (or, older spelling, ‘indomethacin’).

Indometh head(Indometh)

If I want to view raw csv write.csv(Indometh,“C:/Users/Siggi/Downloads/Indometh.csv”, row.names = FALSE)

class(Indometh) #grouped data AND data frame

boxplot(time ~ Subject, data = Indometh)

boxplot(conc ~ Subject, data = Indometh)

ggplot(Indometh, aes(x=time, y=conc, color=Subject)) + 
  geom_smooth()+
  geom_point()+
  labs(title="Indometacin Plasma Concentration vs. Time", x = "Time (hr)", y = "Plasma Concentrations of Indometacin (mcg/ml)",  
       caption="The Indometh data frame has 66 rows and 3 columns of data on the pharmacokinetics of indometacin")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

library(plotly)
plot_ly(data = Indometh, x=~time, y=~conc, type = 'scatter', mode = 'lines', color=~Subject) %>% 
  arrange(Subject) %>% 
  layout(title="Indometacin Plasma Concentration vs. Time", 
         xaxis = list(title = "Time (hr)"), 
         yaxis = list(title = "Plasma Concentrations of Indometacin (mcg/ml)"),
         legend=list(title=list(text='<b> Subject </b>')),
         annotations = list(x = 1.0, y = -0.1, showarrow = F, xref='paper', yref='paper', 
         xanchor='right', yanchor='auto', xshift=0, yshift=0, text = "The Indometh data frame has 66 rows and 3 columns of             data on the pharmacokinetics of indometacin"))

InsectSprays

Loading Data / Basic Plotting Tutorial

library(ggplot2) #qplot (basic scatterplot) 
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)
library(ggcorrplot)

?datasets library(help = “datasets”) ?InsectSprays

The counts of insects in agricultural experimental units treated with different insecticides.

InsectSprays head(InsectSprays)

If I want to view raw csv write.csv(InsectSprays,“C:/Users/Siggi/Downloads/InsectSprays.csv”, row.names = FALSE)

class(InsectSprays) #data frame

#Either method for boxplot 1.)
ggplot(InsectSprays, aes(x=spray, y=count))+
  geom_boxplot()

#2.)
boxplot(count ~ spray, data = InsectSprays,
        xlab = "Type of spray", ylab = "Insect count",
        main = "InsectSprays data", varwidth = TRUE, col = "lightgray")

ANOVAsprays <- aov(count ~ spray, data = InsectSprays)
summary(ANOVAsprays) #Reject null hypothesis. P-value less than 0.05. There is a sig difference.
##             Df Sum Sq Mean Sq F value Pr(>F)    
## spray        5   2669   533.8    34.7 <2e-16 ***
## Residuals   66   1015    15.4                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(ANOVAsprays) #same thing as using summary
## Analysis of Variance Table
## 
## Response: count
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## spray      5 2668.8  533.77  34.702 < 2.2e-16 ***
## Residuals 66 1015.2   15.38                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pairwise.t.test(x=InsectSprays$count, g=InsectSprays$spray, p.adj="bonf")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  InsectSprays$count and InsectSprays$spray 
## 
##   A       B       C       D       E      
## B 1       -       -       -       -      
## C 1.1e-09 1.3e-10 -       -       -      
## D 1.5e-06 1.8e-07 1       -       -      
## E 4.1e-08 4.9e-09 1       1       -      
## F 1       1       4.2e-12 6.1e-09 1.6e-10
## 
## P value adjustment method: bonferroni
plot(ANOVAsprays)

#Using SQRT to satisfy ANOVA conditions and ensure results are more accurate
ggplot(InsectSprays, aes(x = spray, y = sqrt(count))) + 
  geom_boxplot()

boxplot(sqrt(count) ~ spray, data = InsectSprays,
        xlab = "Type of spray", ylab = "Insect count",
        main = "InsectSprays data", varwidth = TRUE, col = "lightgray")

ANOVAspraysSQRT <- aov(sqrt(count) ~ spray, data = InsectSprays)
summary(ANOVAspraysSQRT)
##             Df Sum Sq Mean Sq F value Pr(>F)    
## spray        5  88.44  17.688    44.8 <2e-16 ***
## Residuals   66  26.06   0.395                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(ANOVAspraysSQRT)
## Analysis of Variance Table
## 
## Response: sqrt(count)
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## spray      5 88.438 17.6876  44.799 < 2.2e-16 ***
## Residuals 66 26.058  0.3948                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(ANOVAspraysSQRT)

t.test(InsectSprays$count[InsectSprays$spray == 'A'], InsectSprays$count[InsectSprays$spray == 'B']) #not significant
## 
##  Welch Two Sample t-test
## 
## data:  InsectSprays$count[InsectSprays$spray == "A"] and InsectSprays$count[InsectSprays$spray == "B"]
## t = -0.45352, df = 21.784, p-value = 0.6547
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.646182  2.979515
## sample estimates:
## mean of x mean of y 
##  14.50000  15.33333
t.test(InsectSprays$count[InsectSprays$spray == 'A'], InsectSprays$count[InsectSprays$spray == 'C']) #significant
## 
##  Welch Two Sample t-test
## 
## data:  InsectSprays$count[InsectSprays$spray == "A"] and InsectSprays$count[InsectSprays$spray == "C"]
## t = 8.4073, df = 14.739, p-value = 5.278e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   9.263901 15.569433
## sample estimates:
## mean of x mean of y 
## 14.500000  2.083333
t.test(InsectSprays$count[InsectSprays$spray == 'A'], InsectSprays$count[InsectSprays$spray == 'D']) #significant
## 
##  Welch Two Sample t-test
## 
## data:  InsectSprays$count[InsectSprays$spray == "A"] and InsectSprays$count[InsectSprays$spray == "D"]
## t = 6.2144, df = 16.735, p-value = 1.012e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   6.325795 12.840871
## sample estimates:
## mean of x mean of y 
## 14.500000  4.916667
t.test(InsectSprays$count[InsectSprays$spray == 'A'], InsectSprays$count[InsectSprays$spray == 'E']) #significant
## 
##  Welch Two Sample t-test
## 
## data:  InsectSprays$count[InsectSprays$spray == "A"] and InsectSprays$count[InsectSprays$spray == "E"]
## t = 7.5798, df = 13.91, p-value = 2.655e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   7.885546 14.114454
## sample estimates:
## mean of x mean of y 
##      14.5       3.5
t.test(InsectSprays$count[InsectSprays$spray == 'A'], InsectSprays$count[InsectSprays$spray == 'F']) #not significant
## 
##  Welch Two Sample t-test
## 
## data:  InsectSprays$count[InsectSprays$spray == "A"] and InsectSprays$count[InsectSprays$spray == "F"]
## t = -0.96194, df = 20.523, p-value = 0.3473
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.857396  2.524063
## sample estimates:
## mean of x mean of y 
##  14.50000  16.66667

JohnsonJohnson

Loading Data / Basic Plotting Tutorial

library(ggplot2) #qplot (basic scatterplot) 
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)
library(ggcorrplot)
library(tseries) #for autoplot with geom_smooth linear regression
library(forecast)

?datasets library(help = “datasets”) ?JohnsonJohnson

Quarterly earnings (dollars) per Johnson & Johnson share 1960–80.

Sources: https://rpubs.com/scarden/757860 https://towardsdatascience.com/time-series-forecasting-in-r-with-holt-winters-16ef9ebdb6c0 https://www.statology.org/ljung-box-test/ https://www.r-bloggers.com/2012/07/holt-winters-forecast-using-ggplot2/

JohnsonJohnson head(JohnsonJohnson) str(JohnsonJohnson)

If I want to view raw csv write.csv(JohnsonJohnson,“C:/Users/Siggi/Downloads/JohnsonJohnson.csv”, row.names = FALSE)

class(JohnsonJohnson) #timeseries

plot(JohnsonJohnson, ylab = 'Earnings', main = 'Quarterly Earnings of Johnson & Johnson')

#the log will reduce variance
plot(log(JohnsonJohnson), ylab = 'Log of Earnings', main = 'Log Transformed Quarterly Earnings of Johnson & Johnson') 

#If I want to use ggplot you have to switch to data frame and make Date numeric column
JohnsonJohnsonDF<-as.data.frame(JohnsonJohnson) 
JohnsonJohnsonDF$Date <- as.numeric(time(JohnsonJohnson)) 
JohnsonJohnsonDFlog <- log(JohnsonJohnsonDF$x)
JohnsonJohnsonDFlog
##             Qtr1        Qtr2        Qtr3        Qtr4
## 1960 -0.34249031 -0.46203546 -0.16251893 -0.82098055
## 1961 -0.49429632 -0.37106368 -0.08338161 -0.59783700
## 1962 -0.32850407 -0.26136476 -0.08338161 -0.51082562
## 1963 -0.18632958 -0.22314355  0.00000000 -0.26136476
## 1964 -0.08338161  0.00000000  0.21511138  0.00000000
## 1965  0.14842001  0.26236426  0.37156356  0.22314355
## 1966  0.23111172  0.32208350  0.62057649  0.44468582
## 1967  0.42526774  0.46373402  0.60431597  0.62057649
## 1968  0.42526774  0.72754861  0.85015093  0.81093022
## 1969  0.77010822  0.88789126  0.99325177  0.81093022
## 1970  1.02604160  1.22964055  1.30562646  1.28093385
## 1971  1.28093385  1.46325540  1.46325540  1.39871688
## 1972  1.58103844  1.61740608  1.61740608  1.48387469
## 1973  1.71918878  1.76644166  1.88251383  1.66959184
## 1974  1.79674701  1.85473427  1.93585981  1.76644166
## 1975  1.93585981  2.04640169  2.05796251  1.81156210
## 1976  2.04640169  2.18717424  2.11384297  1.92278773
## 1977  2.25549349  2.32825284  2.25549349  2.16676537
## 1978  2.47485631  2.48989419  2.49732917  2.18717424
## 1979  2.64191040  2.56186769  2.69799987  2.30158459
## 1980  2.78501124  2.68580459  2.77383794  2.45186680
ggplot(JohnsonJohnsonDF, aes(x=Date, y=JohnsonJohnson))+
  geom_line()
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

ggplot(JohnsonJohnsonDF, aes(x=Date, y=JohnsonJohnsonDFlog))+
  geom_line()
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

plot(decompose(log(JohnsonJohnson)))

JJHW1 <- HoltWinters(JohnsonJohnson)
JJHW1
## Holt-Winters exponential smoothing with trend and additive seasonal component.
## 
## Call:
## HoltWinters(x = JohnsonJohnson)
## 
## Smoothing parameters:
##  alpha: 0.06170717
##  beta : 1
##  gamma: 1
## 
## Coefficients:
##          [,1]
## a  13.7617605
## b   0.3993408
## s1  3.6771275
## s2  1.7105925
## s3  2.6575803
## s4 -2.1517605
#Custom HoltWinters fitting
JJHW2 <- HoltWinters(JohnsonJohnson, alpha=0.2, beta=0.1, gamma=0.1)
JJHW2
## Holt-Winters exponential smoothing with trend and additive seasonal component.
## 
## Call:
## HoltWinters(x = JohnsonJohnson, alpha = 0.2, beta = 0.1, gamma = 0.1)
## 
## Smoothing parameters:
##  alpha: 0.2
##  beta : 0.1
##  gamma: 0.1
## 
## Coefficients:
##          [,1]
## a  14.3540610
## b   0.3479427
## s1  0.8452120
## s2  0.6557786
## s3  0.7552920
## s4 -0.9115116
#Visually evaluate the fits
plot(JohnsonJohnson, ylab="value", xlim=c(1960,1981))
lines(JJHW1$fitted[,1], lty=2, col="blue")
lines(JJHW2$fitted[,1], lty=2, col="red")

#Predicting 24 months into future with 95% confidence. 
JJHW1.pred <- predict(JJHW1, 24, prediction.interval = TRUE, level=0.95)
#Visually evaluate the prediction
plot(JohnsonJohnson, ylab="value", xlim=c(1960,1983), ylim=c(0,25))
lines(JJHW1$fitted[,1], lty=2, col="blue")
lines(JJHW1.pred[,1], col="red")
lines(JJHW1.pred[,2], lty=2, col="brown")
lines(JJHW1.pred[,3], lty=2, col="brown")

plot(JJHW1, JJHW1.pred)

#Seasonality prediction 
JJHW3 <- HoltWinters(JohnsonJohnson, seasonal = "multiplicative")
JJHW3.pred <- predict(JJHW3, 24, prediction.interval = TRUE, level=0.95)
plot(JohnsonJohnson, ylab="value", xlim=c(1960,1983), ylim=c(0,25))
lines(JJHW3$fitted[,1], lty=2, col="blue")
lines(JJHW3.pred[,1], col="red")
lines(JJHW3.pred[,2], lty=2, col="brown")
lines(JJHW3.pred[,3], lty=2, col="brown")

#Using forecast (similar to ARIMA)
JJHW1_forecast <- forecast(JJHW1, h=12, level=c(80,95))
plot(JJHW1_forecast, xlim=c(1960,1983))
lines(JJHW1_forecast$fitted, lty=2, col="red")

#acf bars should be within blue bars if there is correlation of fit residuals. acf(JJHW1_forecast\(residuals, lag.max=20, na.action=na.pass) #Ideally, we would like to fail to reject the null hypothesis. That is, #we would like to see the p-value of the test be greater than 0.05 because #this means the residuals for our time series model are independent, which #is often an assumption we make when creating a model. Box.test(JJHW1_forecast\)residuals, lag=20, type=“Ljung-Box”) #no skew is good! hist(JJHW1_forecast$residuals)









ARIMA METHOD


```r
#ARIMA is an acronym for “autoregressive integrated moving average.” It's a model used in statistics and econometrics to measure events that happen over a period of time.
arimaJohnsonJohnson <- auto.arima(JohnsonJohnson)
arimaJohnsonJohnson
## Series: JohnsonJohnson 
## ARIMA(3,1,1)(0,1,0)[4] 
## 
## Coefficients:
##           ar1     ar2     ar3      ma1
##       -0.1712  0.1387  -0.208  -0.6636
## s.e.   0.1769  0.1701   0.121   0.1542
## 
## sigma^2 = 0.1808:  log likelihood = -43.01
## AIC=96.02   AICc=96.84   BIC=107.86
#if the residuals plot is around - w/ no movement, then ARIMA model is a good fit
ggtsdiag(arimaJohnsonJohnson)

#95% confidence looking 10 months into the future
forecastarimaJohnsonJohnson <- forecast(arimaJohnsonJohnson, level = c(80,95), h = 12)
autoplot(forecastarimaJohnsonJohnson)